## R Markdown
# Load necessary libraries
library(readr)
## Warning: package 'readr' was built under R version 4.4.1
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(caret)
## Loading required package: lattice
library(shiny)
## Warning: package 'shiny' was built under R version 4.4.1
library(reshape2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.1
## corrplot 0.92 loaded
# Step 1: Load and Explore the Data
# Load the datasets
train_data <- read.csv(file.choose(), header = TRUE)
test_data <- read.csv(file.choose(), header = TRUE)
# View the first few rows of the datasets
head(train_data)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 1 7.2 0.34 0.34 12.6 0.048
## 2 2 6.0 0.27 0.28 4.8 0.063
## 3 3 6.9 0.26 0.49 1.6 0.058
## 4 4 6.6 0.25 0.34 3.0 0.054
## 5 5 7.1 0.17 0.43 1.3 0.023
## 6 6 6.0 0.29 0.25 1.4 0.033
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 7 41 0.99420 3.19 0.40 11.7 white
## 2 31 201 0.99640 3.69 0.71 10.0 white
## 3 39 166 0.99650 3.65 0.52 9.4 white
## 4 22 141 0.99338 3.26 0.47 10.4 white
## 5 33 132 0.99067 3.11 0.56 11.7 white
## 6 30 114 0.98794 3.08 0.43 13.2 white
## location quality
## 1 Texas 5
## 2 Texas 5
## 3 Texas 4
## 4 California 6
## 5 California 6
## 6 California 6
summary(train_data)
## ID fixed.acidity volatile.acidity citric.acid
## Min. : 1 Min. : 3.800 Min. :0.0800 Min. :0.0000
## 1st Qu.:1366 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500
## Median :2732 Median : 7.000 Median :0.2900 Median :0.3100
## Mean :2732 Mean : 7.218 Mean :0.3382 Mean :0.3185
## 3rd Qu.:4098 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900
## Max. :5463 Max. :15.900 Max. :1.5800 Max. :1.6600
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. : 0.60 Min. :0.00900 Min. : 1.00 Min. : 6.0
## 1st Qu.: 1.80 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 78.0
## Median : 3.00 Median :0.04700 Median : 29.00 Median :118.0
## Mean : 5.42 Mean :0.05613 Mean : 30.58 Mean :115.9
## 3rd Qu.: 8.10 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:155.0
## Max. :31.60 Max. :0.61100 Max. :289.00 Max. :440.0
## density pH sulphates alcohol
## Min. :0.9871 Min. :2.720 Min. :0.2200 Min. : 8.0
## 1st Qu.:0.9923 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.5
## Median :0.9949 Median :3.210 Median :0.5100 Median :10.3
## Mean :0.9947 Mean :3.217 Mean :0.5318 Mean :10.5
## 3rd Qu.:0.9969 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.3
## Max. :1.0103 Max. :4.010 Max. :2.0000 Max. :14.9
## type location quality
## Length:5463 Length:5463 Min. :3.000
## Class :character Class :character 1st Qu.:5.000
## Mode :character Mode :character Median :6.000
## Mean :5.823
## 3rd Qu.:6.000
## Max. :9.000
head(test_data)
## ID fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 5464 6.6 0.840 0.03 2.3 0.059
## 2 5465 7.2 0.540 0.27 2.6 0.084
## 3 5466 8.9 0.565 0.34 3.0 0.093
## 4 5467 6.7 0.130 0.32 3.7 0.017
## 5 5468 7.0 0.570 0.02 2.0 0.072
## 6 5469 7.0 0.170 0.31 4.8 0.034
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol type
## 1 32 48 0.99520 3.52 0.56 12.3 red
## 2 12 78 0.99640 3.39 0.71 11.0 red
## 3 16 112 0.99980 3.38 0.61 9.5 red
## 4 32 99 0.99348 3.12 0.44 10.0 white
## 5 17 26 0.99575 3.36 0.61 10.2 red
## 6 34 132 0.99440 3.36 0.48 9.6 white
## location
## 1 California
## 2 Texas
## 3 Texas
## 4 California
## 5 Texas
## 6 California
summary(test_data)
## ID fixed.acidity volatile.acidity citric.acid
## Min. :5464 Min. : 4.200 Min. :0.0850 Min. :0.0000
## 1st Qu.:5722 1st Qu.: 6.400 1st Qu.:0.2200 1st Qu.:0.2400
## Median :5980 Median : 7.000 Median :0.3000 Median :0.3100
## Mean :5980 Mean : 7.204 Mean :0.3474 Mean :0.3195
## 3rd Qu.:6239 3rd Qu.: 7.600 3rd Qu.:0.4300 3rd Qu.:0.3975
## Max. :6497 Max. :15.500 Max. :0.9650 Max. :1.2300
## residual.sugar chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. : 0.600 Min. :0.01200 Min. : 1.00 Min. : 7.00
## 1st Qu.: 1.900 1st Qu.:0.03800 1st Qu.: 16.00 1st Qu.: 75.25
## Median : 3.300 Median :0.04700 Median : 29.00 Median :118.00
## Mean : 5.566 Mean :0.05551 Mean : 30.23 Mean :115.01
## 3rd Qu.: 8.200 3rd Qu.:0.06475 3rd Qu.: 41.75 3rd Qu.:157.00
## Max. :65.800 Max. :0.41300 Max. :108.00 Max. :294.00
## density pH sulphates alcohol
## Min. :0.9875 Min. :2.800 Min. :0.2600 Min. : 8.50
## 1st Qu.:0.9924 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50
## Median :0.9950 Median :3.210 Median :0.5000 Median :10.30
## Mean :0.9948 Mean :3.225 Mean :0.5285 Mean :10.46
## 3rd Qu.:0.9972 3rd Qu.:3.330 3rd Qu.:0.6000 3rd Qu.:11.29
## Max. :1.0390 Max. :3.900 Max. :1.6200 Max. :14.20
## type location
## Length:1034 Length:1034
## Class :character Class :character
## Mode :character Mode :character
##
##
##
# Check for missing values
sum(is.na(train_data))
## [1] 0
sum(is.na(test_data))
## [1] 0
# Pair plot to visualize relationships between all numerical variables
ggpairs(train_data)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Histogram for each parameter
for(col in colnames(train_data)) {
if(is.numeric(train_data[[col]])) {
print(ggplot(train_data, aes_string(x = col)) +
geom_histogram(binwidth = 80, fill = 'blue', color = 'black') +
ggtitle(paste("Histogram of", col)))
}
}
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.













# Boxplot for each parameter to check for outliers
for(col in colnames(train_data)) {
if(is.numeric(train_data[[col]])) {
print(ggplot(train_data, aes_string(y = col)) +
geom_boxplot(fill = 'orange', color = 'black') +
ggtitle(paste("Boxplot of", col)))
}
}













# Scatter plots for each pair of numerical parameters
numeric_columns <- colnames(train_data)[sapply(train_data, is.numeric)]
for(i in 1:(length(numeric_columns)-1)) {
for(j in (i+1):length(numeric_columns)) {
print(ggplot(train_data, aes_string(x = numeric_columns[i], y = numeric_columns[j])) +
geom_point(color = 'green') +
geom_smooth(method = "lm", col = "orange") +
ggtitle(paste("Scatter plot of", numeric_columns[i], "and", numeric_columns[j])) +
labs(x = numeric_columns[i], y = numeric_columns[j]))
}
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

#Plot Training Data in respect to Wine Data
# Histogram for each parameter with respect to quality
for(col in colnames(train_data)) {
if(is.numeric(train_data[[col]]) && col != "quality") {
print(ggplot(train_data, aes_string(x = "quality", y = col)) +
geom_histogram(stat = "identity", fill = 'blue', color = 'yellow') +
ggtitle(paste("Histogram of", col, "with respect to Wine Quality")) +
labs(x = "Quality", y = col))
}
}
## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"): Ignoring unknown parameters: `binwidth`, `bins`, and `pad`
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`

## Warning in geom_histogram(stat = "identity", fill = "blue", color = "yellow"):
## Ignoring unknown parameters: `binwidth`, `bins`, and `pad`


# Boxplot for each parameter with respect to quality
for(col in colnames(train_data)) {
if(is.numeric(train_data[[col]]) && col != "quality") {
print(ggplot(train_data, aes_string(x = "quality", y = col)) +
geom_boxplot(fill = 'orange', color = 'blue') +
ggtitle(paste("Boxplot of", col, "with respect to Wine Quality")) +
labs(x = "Quality", y = col))
}
}
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

# Scatter plots for each numerical parameter with respect to quality
numeric_columns <- colnames(train_data)[sapply(train_data, is.numeric)]
for(col in numeric_columns) {
if(col != "quality") {
print(ggplot(train_data, aes_string(x = "quality", y = col)) +
geom_point(color = 'green') +
geom_smooth(method = "lm", col = "purple") +
ggtitle(paste("Scatter plot of", col, "with respect to Wine Quality")) +
labs(x = "Quality", y = col))
}
}
## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

## `geom_smooth()` using formula = 'y ~ x'

# Step 2: Preprocess the Data
# Check the structure of the training data
str(train_data)
## 'data.frame': 5463 obs. of 15 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ fixed.acidity : num 7.2 6 6.9 6.6 7.1 6 7.2 6.8 9.1 7.8 ...
## $ volatile.acidity : num 0.34 0.27 0.26 0.25 0.17 0.29 0.57 0.45 0.27 0.32 ...
## $ citric.acid : num 0.34 0.28 0.49 0.34 0.43 0.25 0.06 0.3 0.32 0.33 ...
## $ residual.sugar : num 12.6 4.8 1.6 3 1.3 1.4 1.6 11.8 1.1 10.4 ...
## $ chlorides : num 0.048 0.063 0.058 0.054 0.023 0.033 0.076 0.094 0.031 0.031 ...
## $ free.sulfur.dioxide : num 7 31 39 22 33 30 9 23 15 47 ...
## $ total.sulfur.dioxide: num 41 201 166 141 132 114 27 97 151 194 ...
## $ density : num 0.994 0.996 0.997 0.993 0.991 ...
## $ pH : num 3.19 3.69 3.65 3.26 3.11 3.08 3.36 3.09 3.03 3.07 ...
## $ sulphates : num 0.4 0.71 0.52 0.47 0.56 0.43 0.7 0.44 0.41 0.58 ...
## $ alcohol : num 11.7 10 9.4 10.4 11.7 13.2 9.6 9.6 10.6 9.6 ...
## $ type : chr "white" "white" "white" "white" ...
## $ location : chr "Texas" "Texas" "Texas" "California" ...
## $ quality : int 5 5 4 6 6 6 6 5 5 6 ...
# Check for missing values and handle them if any
train_data <- na.omit(train_data)
# Split the train data into features and labels
x_train <- train_data[, -which(colnames(train_data) == "quality")]
y_train <- train_data$quality
# Ensure the test data has the same columns as the train predictors
test_predictors <- test_data[, colnames(x_train)]
# Step 3: Build and Train a Linear Regression Model
lm_model <- lm(quality ~ ., data = train_data)
# Predict on the training set using linear regression
val_predictions_lm <- predict(lm_model, newdata = train_data)
# Step 4: Evaluate the Linear Regression Model
# Function to calculate Mean Absolute Error
calculate_mae <- function(actual, predicted) {
n <- length(actual)
mae <- sum(abs(actual - predicted)) / n
return(mae)
}
# Calculate MAE for Linear Regression
val_mae_lm <- calculate_mae(y_train, val_predictions_lm)
print(paste("Validation MAE for Linear Regression:", val_mae_lm))
## [1] "Validation MAE for Linear Regression: 0.500030518564883"
# Step 5: Analyze the Data for Insights
# Define the features of interest
features_of_interest <- c("fixed.acidity", "volatile.acidity", "residual.sugar", "pH", "alcohol", "quality")
# Subset the data to include only these features
subset_data <- train_data %>% select(all_of(features_of_interest))
# Correlation matrix for the selected features
cor_matrix <- cor(subset_data)
corrplot(cor_matrix, method = "circle", type = "upper", tl.col = "black", tl.srt = 45, title = "Correlation Matrix")

# Function to create scatter plots
create_scatter_plot <- function(data, x, y) {
ggplot(data, aes_string(x = x, y = y)) +
geom_point(alpha = 0.5) +
geom_smooth(method = "lm", col = "purple") +
theme_minimal() +
labs(title = paste("Scatter plot of", x, "vs", y),
x = x, y = y)
}
# Function to create box plots
create_box_plot <- function(data, x, y) {
ggplot(data, aes_string(x = x, y = y)) +
geom_boxplot() +
theme_minimal() +
labs(title = paste("Box plot of", x, "vs", y),
x = x, y = y)
}
# Scatter plots to investigate relationships between features and quality
scatter_fixed_acidity <- create_scatter_plot(subset_data, "fixed.acidity", "quality")
scatter_volatile_acidity <- create_scatter_plot(subset_data, "volatile.acidity", "quality")
scatter_residual_sugar <- create_scatter_plot(subset_data, "residual.sugar", "quality")
scatter_pH <- create_scatter_plot(subset_data, "pH", "quality")
scatter_alcohol <- create_scatter_plot(subset_data, "alcohol", "quality")
# Box plots to investigate relationships between features and quality
box_fixed_acidity <- create_box_plot(subset_data, "quality", "fixed.acidity")
box_volatile_acidity <- create_box_plot(subset_data, "quality", "volatile.acidity")
box_residual_sugar <- create_box_plot(subset_data, "quality", "residual.sugar")
box_pH <- create_box_plot(subset_data, "quality", "pH")
box_alcohol <- create_box_plot(subset_data, "quality", "alcohol")
# Plot the scatter plots
print(scatter_fixed_acidity)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_volatile_acidity)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_residual_sugar)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_pH)
## `geom_smooth()` using formula = 'y ~ x'

print(scatter_alcohol)
## `geom_smooth()` using formula = 'y ~ x'

# Plot the box plots
print(box_fixed_acidity)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_volatile_acidity)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_residual_sugar)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_pH)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

print(box_alcohol)
## Warning: Continuous x aesthetic
## ℹ did you forget `aes(group = ...)`?

# Step 6: Predict the quality for the test data using the linear regression model
test_predictions <- predict(lm_model, newdata = test_predictors)
# Add the predictions to the test data
test_data$quality <- test_predictions
# Step 7: Extract coefficients from the linear regression model
lm_coefficients <- summary(lm_model)$coefficients
# Step 8: Create a data frame for feature importance based on the absolute values of the coefficients
feature_importance <- data.frame(
Feature = rownames(lm_coefficients)[-1], # Exclude the intercept
Importance = abs(lm_coefficients[-1, "Estimate"]) # Take the absolute value of the coefficients
)
# Step 9: Order the data by importance
# Define colors based on importance (high importance in red)
feature_importance$color <- ifelse(feature_importance$Importance > median(feature_importance$Importance), "red", "gray")
# Step 10: Plotting the feature importance
ggplot(feature_importance, aes(x = reorder(Feature, Importance), y = Importance, fill = color)) +
geom_bar(stat = "identity", color = "black", show.legend = FALSE) +
scale_fill_manual(values = c("gray" = "gray", "red" = "red")) +
coord_flip() +
theme_minimal() +
labs(title = "Feature Importance in Wine Quality Prediction (Linear Regression)",
x = "Wine Features",
y = "Importance Score")

# Prepare the final dataframe with ID and predicted quality
final_results <- test_data %>% select(ID, quality)
# Save the predictions to a CSV file
write.csv(final_results, "Wine_Quality_PredictionsLR.csv")